The data contains features extracted from the silhouettes of vehicles viewed from different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000, and an Opel Manta 400. This particular combination was chosen with the expectation that the bus, the van, and either one of the cars would be readily distinguishable, but that the two cars would be more difficult to tell apart.
COMPACTNESS: (average perim)**2/area
CIRCULARITY: (average radius)**2/area
DISTANCE CIRCULARITY: area/(av.distance from border)**2
RADIUS RATIO: (max.rad-min.rad)/av.radius
PR.AXIS ASPECT RATIO: (minor axis)/(major axis)
MAX.LENGTH ASPECT RATIO: (length perp. max length)/(max length)
SCATTER RATIO: (inertia about minor axis)/(inertia about major axis)
ELONGATEDNESS: area/(shrink width)**2
PR.AXIS RECTANGULARITY: area/(pr.axis length*pr.axis width)
MAX.LENGTH RECTANGULARITY: area/(max.length*length perp. to this)
SCALED VARIANCE ALONG MAJOR AXIS: (2nd order moment about minor axis)/area
SCALED VARIANCE ALONG MINOR AXIS: (2nd order moment about major axis)/area
SCALED RADIUS OF GYRATION: (mavar+mivar)/area
SKEWNESS ABOUT MAJOR AXIS: (3rd order moment about major axis)/sigma_min**3
SKEWNESS ABOUT MINOR AXIS: (3rd order moment about minor axis)/sigma_maj**3
KURTOSIS ABOUT MINOR AXIS: (4th order moment about major axis)/sigma_min**4
KURTOSIS ABOUT MAJOR AXIS: (4th order moment about minor axis)/sigma_maj**4
HOLLOWS RATIO: (area of hollows)/(area of bounding polygon)
CLASS: the type of vehicle (bus, van, Saab, or Opel).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(color_codes=True)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
dataset = pd.read_csv("D:/Study/AI-ML/Dataset/vehicle.csv")
dataset.head()
dataset.info() # 846 entries in total, but several attributes have fewer than 846 non-null values, indicating missing data.
dataset.isnull().sum() # Many attributes contain missing values
dataset.describe().T
dataset['class'] = dataset['class'].astype('category')
data = dataset.drop("class",axis=1)
dataset.plot(kind="box",figsize=(20,10))
# Some columns have outliers while others do not.
# Attributes with missing values and outliers are imputed with the median (robust to outliers).
# Attributes with missing values and no outliers are imputed with the mean.
# Outlier values themselves are replaced by the median in a later step.
# From the boxplot: attributes 4,5,6,11,12,14,15,16 (1-based) have outliers -> impute with median.
# Attributes 1,2,3,7,8,9,10,13,17,18 (1-based) have no outliers -> impute with mean.
# The index lists below give the same columns in 0-based positions; a sketch to
# derive them programmatically follows.
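# Sanity-check sketch (not in the original analysis): recover the outlier /
# non-outlier column lists with the same 1.5*IQR rule used later, instead of
# reading them off the boxplot by eye. The *_idx_check names are hypothetical.
outlier_idx_check, non_outlier_idx_check = [], []
for idx, col in enumerate(dataset.columns[:18]):
    q1, q3 = dataset[col].quantile(0.25), dataset[col].quantile(0.75)
    iqr = q3 - q1
    has_outliers = ((dataset[col] < q1 - 1.5*iqr) | (dataset[col] > q3 + 1.5*iqr)).any()
    (outlier_idx_check if has_outliers else non_outlier_idx_check).append(idx)
print(outlier_idx_check, non_outlier_idx_check)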
outlier_attributes = [3,4,5,10,11,13,14,15]
non_outlier_attributes = [0,1,2,6,7,8,9,12,16,17]
for i in dataset.columns[outlier_attributes]:
    median = dataset[i].median()
    dataset[i] = dataset[i].fillna(median)
for i in dataset.columns[non_outlier_attributes]:
    mean = dataset[i].mean()
    dataset[i] = dataset[i].fillna(mean)
dataset.isnull().sum() # Verify that no missing values remain
dataset.columns[:18] # The 18 numeric feature columns; 'class' is the last column
# Handling outliers
for col in dataset.columns[:18]:
    q1 = dataset[col].quantile(0.25)
    q3 = dataset[col].quantile(0.75)
    iqr = q3 - q1
    low = q1 - (1.5*iqr)
    high = q3 + (1.5*iqr)
    dataset.loc[(dataset[col] < low) | (dataset[col] > high), col] = dataset[col].median()
dataset.plot(kind="box",figsize=(20,10))
# The boxplots confirm the outliers have been treated (replaced by column medians)
dataset.hist(figsize=(15,15))
plt.show()
# Skewness exists among the different attributes.
sns.distplot(dataset['compactness'],hist=False) #Almost normal with a little skew
fig,axes = plt.subplots(nrows=3,ncols=3,figsize=(15,7))
sns.distplot(dataset['circularity'],hist=False,ax = axes[0][0])
sns.distplot(dataset['distance_circularity'],hist=False,ax = axes[0][1])
sns.distplot(dataset['scatter_ratio'],hist=False,ax = axes[0][2])
sns.distplot(dataset['pr.axis_rectangularity'],hist=False,ax = axes[1][0])
sns.distplot(dataset['scaled_variance'],hist=False,ax = axes[1][1])
sns.distplot(dataset['scaled_radius_of_gyration'],hist=False,ax = axes[1][2])
sns.distplot(dataset['skewness_about'],hist=False,ax = axes[2][0])
sns.distplot(dataset['skewness_about.1'],hist=False,ax = axes[2][1])
sns.distplot(dataset['hollows_ratio'],hist=False,ax = axes[2][2])
# Skewness exists across these attributes as well.
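# The visual impression can be quantified; a quick check (sketch, not in the
# original) using pandas' built-in skew(), which computes per-column sample
# skewness (values far from 0 indicate stronger skew).
dataset[dataset.columns[:18]].skew().sort_values()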
sns.scatterplot(dataset['compactness'],dataset['circularity'],hue=dataset['class'])
# Compactness for vans ranges from about 82 to 100, whereas the other
# vehicle types show a wider spread.
sns.scatterplot(dataset['compactness'],dataset['distance_circularity'],hue=dataset['class'])
# Buses have a maximum distance circularity of 90 whereas it is greater for other classes of vehicles.
sns.scatterplot(dataset['scatter_ratio'],dataset['circularity'],hue=dataset['class'])
# Moderately strong positive linear correlation between scatter_ratio and circularity.
sns.scatterplot(dataset['scatter_ratio'],dataset['distance_circularity'],hue=dataset['class'])
# Strong positive linear correlation.
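# Backing the visual reads with numbers (sketch, not in the original):
# Pearson r for the pairs plotted above, via pandas' Series.corr().
for a, b in [('compactness', 'circularity'),
             ('compactness', 'distance_circularity'),
             ('scatter_ratio', 'circularity'),
             ('scatter_ratio', 'distance_circularity')]:
    print(a, 'vs', b, '->', round(dataset[a].corr(dataset[b]), 2))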
fig,axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
sns.scatterplot(dataset['circularity'],dataset['elongatedness'],hue=dataset['class'],ax=axes[0][0])
sns.scatterplot(dataset['distance_circularity'],dataset['elongatedness'],hue=dataset['class'],ax=axes[0][1])
sns.scatterplot(dataset['radius_ratio'],dataset['elongatedness'],hue=dataset['class'],ax=axes[1][0])
sns.scatterplot(dataset['scatter_ratio'],dataset['elongatedness'],hue=dataset['class'],ax=axes[1][1])
plt.show()
# Very strong negative correlation between elongatedness and attributes such as circularity and scatter_ratio.
fig,axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
sns.scatterplot(dataset['circularity'],dataset['pr.axis_rectangularity'],hue=dataset['class'],ax=axes[0][0])
sns.scatterplot(dataset['distance_circularity'],dataset['pr.axis_rectangularity'],hue=dataset['class'],ax=axes[0][1])
sns.scatterplot(dataset['scatter_ratio'],dataset['pr.axis_rectangularity'],hue=dataset['class'],ax=axes[1][0])
sns.scatterplot(dataset['elongatedness'],dataset['pr.axis_rectangularity'],hue=dataset['class'],ax=axes[1][1])
plt.show()
# Strong positive correlation between pr.axis_rectangularity and attributes such as circularity and scatter_ratio.
# Strong negative linear correlation between pr.axis_rectangularity and elongatedness.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(15,5))
sns.scatterplot(dataset['scatter_ratio'],dataset['max.length_rectangularity'],hue=dataset['class'],ax=axes[0])
sns.scatterplot(dataset['elongatedness'],dataset['max.length_rectangularity'],hue=dataset['class'],ax=axes[1])
# Vans have the lowest scatter ratio relative to max.length rectangularity, followed by the cars and buses.
# Conversely, vans have the highest elongatedness relative to max.length rectangularity, followed by the cars and buses.
fig,axes = plt.subplots(nrows=3,ncols=2,figsize=(15,15))
sns.scatterplot(dataset['compactness'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[0][0])
sns.scatterplot(dataset['circularity'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[0][1])
sns.scatterplot(dataset['distance_circularity'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[1][0])
sns.scatterplot(dataset['radius_ratio'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[1][1])
sns.scatterplot(dataset['scatter_ratio'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[2][0])
sns.scatterplot(dataset['elongatedness'],dataset['scaled_variance'],hue=dataset['class'],ax=axes[2][1])
plt.show()
# Strong positive correlation between scaled_variance and attributes such as circularity and scatter_ratio.
# Strong negative linear correlation between scaled_variance and elongatedness.
sns.pairplot(dataset,diag_kind="kde")
# The attributes are highly linearly related to each other, either positively or negatively.
sns.pairplot(dataset,diag_kind="kde",hue="class")
# Most attributes have a strong positive linear correlation with one another,
# but a strong negative linear correlation with elongatedness.
corr = dataset.corr()
plt.figure(figsize=(13,7))
sns.heatmap(corr,annot=True)
# Many attribute pairs show strong positive or strong negative correlation.
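# A compact, programmatic view of the heatmap (sketch, not in the original):
# the strongest attribute pairs by absolute correlation. Note drop_duplicates()
# also removes the mirrored (b, a) entries since they share the same value.
top = corr.abs().unstack().sort_values(ascending=False)
top = top[top < 1].drop_duplicates()
print(top.head(10))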
# Splitting the data into independent variables and the target variable
X = dataset.iloc[:,0:18]
y = dataset.iloc[:,-1]
from sklearn.model_selection import train_test_split # For splitting the dataset
from sklearn.metrics import confusion_matrix # For evaluating the results
from sklearn.metrics import classification_report # For viewing the classification report
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from scipy.stats import zscore
# Standardizing the data (z-score: zero mean, unit variance per column)
X_z = X.apply(zscore)
X_train, X_test, y_train, y_test = train_test_split(X_z,y,test_size = 0.3, random_state = 10)
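# Note: z-scoring the full dataset before splitting lets test-set statistics
# leak into the training data. A leakage-free alternative (sketch, using
# sklearn's StandardScaler; the X_tr/X_te names are illustrative) fits the
# scaler on the training split only:
from sklearn.preprocessing import StandardScaler
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=10)
scaler = StandardScaler().fit(X_tr)   # statistics computed from training rows only
X_tr_scaled, X_te_scaled = scaler.transform(X_tr), scaler.transform(X_te)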
classifier_SVM_rbf_kernel = svm.SVC(gamma='auto')
classifier_SVM_rbf_kernel.fit(X_train,y_train)
y_pred_SVM_rbf = classifier_SVM_rbf_kernel.predict(X_test)
svm_score_rbf = classifier_SVM_rbf_kernel.score(X_test,y_test)
svm_score_rbf
svm_cm = confusion_matrix(y_test,y_pred_SVM_rbf)
svm_cm
svm_cr = classification_report(y_test,y_pred_SVM_rbf)
print(svm_cr)
classifier_SVM_linear_kernel = svm.SVC(kernel = "linear",gamma='auto')
classifier_SVM_linear_kernel.fit(X_train,y_train)
y_pred_SVM_linear = classifier_SVM_linear_kernel.predict(X_test)
svm_score_linear = classifier_SVM_linear_kernel.score(X_test,y_test)
svm_score_linear
svm_cm2 = confusion_matrix(y_test,y_pred_SVM_linear)
svm_cm2
svm_cr2 = classification_report(y_test,y_pred_SVM_linear)
print(svm_cr2)
classifier_NB = GaussianNB()
classifier_NB.fit(X_train,y_train)
y_pred_NB = classifier_NB.predict(X_test)
NB_score = classifier_NB.score(X_test,y_test)
NB_score
NB_cm = confusion_matrix(y_test,y_pred_NB)
NB_cm
NB_cr = classification_report(y_test,y_pred_NB)
print(NB_cr)
# Low accuracy and recall for certain classes. We aim to improve on this by applying PCA.
from sklearn.decomposition import PCA
pca = PCA()
X_pca = X_z
pca.fit(X_pca)
cum = np.cumsum(pca.explained_variance_ratio_)
sns.pointplot(x=np.arange(1,19),y=cum)
np.cumsum(pca.explained_variance_ratio_)
# The first 9 principal components capture about 98% of the variance,
# so the dimensionality can be halved from 18 to 9 with only ~2% information loss.
# The first 7 components already explain more than 95% of the variance.
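# Rather than reading the component count off the cumulative curve, scikit-learn
# can choose it directly: passing a float in (0, 1) as n_components keeps the
# smallest number of components whose cumulative explained variance exceeds that
# fraction. A short sketch of this shortcut (pca_95 is an illustrative name):
pca_95 = PCA(n_components=0.95)
pca_95.fit(X_z)
print(pca_95.n_components_)   # number of components retained for the 95% target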
plt.figure(figsize=(10,6))
sns.barplot(x=np.arange(1,19),y=pca.explained_variance_ratio_,label = "Individual Explained Variance")
plt.step(x = np.arange(1,19),y=cum,where='mid',label = "Cumulative Explained Variance")
plt.ylabel('Explained variance ratio')
plt.xlabel("Principal Components")
plt.legend(loc = 'center right')
plt.show()
pca = PCA(n_components=9)
X_pca = X_z
pca.fit(X_pca)
X_pca = pca.transform(X_pca)
X_pca = pd.DataFrame(X_pca)
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca,y,test_size = 0.3, random_state = 10)
classifier_SVM_rbf_kernel_pca = svm.SVC(gamma='auto')
classifier_SVM_rbf_kernel_pca.fit(X_train_pca,y_train)
y_pred_SVM_rbf_pca = classifier_SVM_rbf_kernel_pca.predict(X_test_pca)
svm_score_rbf_pca = classifier_SVM_rbf_kernel_pca.score(X_test_pca,y_test)
svm_score_rbf_pca
svm_cm_pca = confusion_matrix(y_test,y_pred_SVM_rbf_pca)
svm_cm_pca
svm_cr_pca = classification_report(y_test,y_pred_SVM_rbf_pca)
print(svm_cr_pca)
classifier_SVM_linear_kernel_pca = svm.SVC(kernel = "linear",gamma='auto')
classifier_SVM_linear_kernel_pca.fit(X_train_pca,y_train)
y_pred_SVM_linear_pca = classifier_SVM_linear_kernel_pca.predict(X_test_pca)
svm_score_linear_pca = classifier_SVM_linear_kernel_pca.score(X_test_pca,y_test)
svm_score_linear_pca
svm_cm_pca2 = confusion_matrix(y_test,y_pred_SVM_linear_pca)
svm_cm_pca2
svm_cr_pca2 = classification_report(y_test,y_pred_SVM_linear_pca)
print(svm_cr_pca2)
classifier_NB_pca = GaussianNB()
classifier_NB_pca.fit(X_train_pca,y_train)
y_pred_NB_pca = classifier_NB_pca.predict(X_test_pca)
NB_score_pca = classifier_NB_pca.score(X_test_pca,y_test)
NB_score_pca
NB_cm_pca = confusion_matrix(y_test,y_pred_NB_pca)
NB_cm_pca
NB_cr_pca = classification_report(y_test,y_pred_NB_pca)
print(NB_cr_pca)
SVM_rbf = []
SVM_linear = []
NB = []
SVM_rbf.append(svm_score_rbf)
SVM_linear.append(svm_score_linear)
NB.append(NB_score)
SVM_rbf,SVM_linear,NB
# Based on the plot above, we start from 7 components (which explain more than
# 95% of the variance) and progressively observe how each model's accuracy
# changes as the number of principal components increases.
for n in range(7,19):
    X_pca = X_z # Reset X for each iteration
    pca = PCA(n_components=n) # Initialize PCA with n components for each iteration
    pca.fit(X_pca)
    X_pca = pca.transform(X_pca)
    X_pca = pd.DataFrame(X_pca)
    # Split the transformed X (X_pca) for building the models
    X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=10)
    # Model 1 - Support Vector Machine
    ## RBF Kernel
    classifier_SVM_rbf_kernel_pca = svm.SVC(gamma='auto')
    classifier_SVM_rbf_kernel_pca.fit(X_train_pca, y_train)
    SVM_rbf.append(classifier_SVM_rbf_kernel_pca.score(X_test_pca, y_test))
    ## Linear Kernel
    classifier_SVM_linear_kernel_pca = svm.SVC(kernel="linear", gamma='auto')
    classifier_SVM_linear_kernel_pca.fit(X_train_pca, y_train)
    SVM_linear.append(classifier_SVM_linear_kernel_pca.score(X_test_pca, y_test))
    ## Naive Bayes
    classifier_NB_pca = GaussianNB()
    classifier_NB_pca.fit(X_train_pca, y_train)
    NB.append(classifier_NB_pca.score(X_test_pca, y_test))
labels = [0,7,8,9,10,11,12,13,14,15,16,17,18] # 0 = baseline score without PCA; 7-18 = number of principal components
values_rbf = []
values_linear = []
values_NB = []
for i in range(0,len(SVM_rbf)):
    values_rbf.append(SVM_rbf[i])
    values_linear.append(SVM_linear[i])
    values_NB.append(NB[i])
plt.figure(figsize=(15,8))
sns.pointplot(x=labels, y=values_rbf, color="#33FF3C", label="RBF")
sns.pointplot(x=labels, y=values_linear, color="#4C33FF", label="Linear")
sns.pointplot(x=labels, y=values_NB, color="#FF4933", label="Naive Bayes")
leg = plt.legend(labels=['RBF','Linear','Naive Bayes'], loc ='center',prop={'size':16})
leg.legendHandles[0].set_color("#33FF3C")
leg.legendHandles[1].set_color("#4C33FF")
leg.legendHandles[2].set_color("#FF4933")
plt.title("Comparing model accuracy without PCA and with PCA\n")
plt.xlabel("Principal Components")
plt.ylabel("Accuracy")
plt.show()
# Considering accuracy alone, all three models reach their maximum at around 14 components.
# Judging by information content instead, 7 components already cover more than 95% of the
# variance; that also gives a good accuracy gain for Naive Bayes, but a drop for the SVM models.
# We therefore choose 9 components, which show a stable and solid accuracy improvement
# across all three models, halving the dimensionality from 18 to 9.
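# All of the accuracies above come from a single 70/30 split. A quick
# cross-validated check of the chosen setup (sketch, not in the original,
# using sklearn's Pipeline and cross_val_score) would make the choice of
# 9 components more robust:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
pipe = Pipeline([('pca', PCA(n_components=9)),
                 ('svc', svm.SVC(gamma='auto'))])
scores = cross_val_score(pipe, X_z, y, cv=5)   # 5-fold CV accuracy
print(scores.mean(), scores.std())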